You can craft a matrix in two ways:
using the matrix() function, or by building a data.frame() and then coercing it with as.matrix(). From a vector:
# Fill a 2 x 3 matrix row by row (use TRUE, not the reassignable alias T;
# c() around a sequence is redundant).
matrix(data = 1:6, nrow = 2, byrow = TRUE)
## [,1] [,2] [,3] ## [1,] 1 2 3 ## [2,] 4 5 6
You can craft a matrix in two ways:
using the matrix() function, or by building a data.frame() and then coercing it with as.matrix(). From a data frame:
# Small design matrix: an intercept column x0 plus two predictors,
# built as a data frame and coerced to a numeric matrix.
df <- data.frame(
  x0 = rep(1, 5),
  x1 = c(7, 4, 7, 6, -1),
  x2 = c(-4, -1, -1, 0, 2)
)
X <- as.matrix(df)
X
## x0 x1 x2 ## [1,] 1 7 -4 ## [2,] 1 4 -1 ## [3,] 1 7 -1 ## [4,] 1 6 0 ## [5,] 1 -1 2
t(X)
## [,1] [,2] [,3] [,4] [,5] ## x0 1 1 1 1 1 ## x1 7 4 7 6 -1 ## x2 -4 -1 -1 0 2
t(X) %*% X
## x0 x1 x2 ## x0 5 23 -4 ## x1 23 151 -41 ## x2 -4 -41 22
solve(t(X) %*% X)
## x0 x1 x2 ## x0 0.9681416 -0.20176991 -0.20000000 ## x1 -0.2017699 0.05545723 0.06666667 ## x2 -0.2000000 0.06666667 0.13333333
Simulate data.
# True coefficients and error variance for the simulation.
B <- c(-3, .5, 2)
sig_sq <- 2.25
# Draw n = 5 Normal errors and generate the response Y = X B + epsilon.
epsilon <- rnorm(5, mean = 0, sd = sqrt(sig_sq))
Y <- X %*% B + epsilon
Estimate coefficients.
solve(t(X) %*% X) %*% t(X) %*% Y
## [,1] ## x0 -3.5116653 ## x1 0.7471687 ## x2 2.5824318
# Append the simulated response and fit OLS with lm() for comparison
# (assignment pulled out of the coef() call for readability).
df <- data.frame(df, Y)
m1 <- lm(Y ~ x1 + x2, data = df)
coef(m1)
## (Intercept) x1 x2 ## -3.5116653 0.7471687 2.5824318
First we need to estimate back \(\sigma^2\).
# Unbiased estimate of sigma^2: RSS / (n - p - 1), here n = 5, p = 2.
epsilon_hat <- residuals(m1)
S_sq <- 1 / (5 - 2 - 1) * sum(epsilon_hat^2)
Which we can plug into our familiar expression.
# Var(beta_hat) = S^2 (X'X)^(-1); standard errors are the sqrt of the diagonal.
var_B <- S_sq * solve(t(X) %*% X)
sqrt(diag(var_B))
## x0 x1 x2 ## 1.0664021 0.2552294 0.3957500
summary(m1)$coef
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -3.5116653 1.0664021 -3.293003 0.08115090 ## x1 0.7471687 0.2552294 2.927440 0.09956480 ## x2 2.5824318 0.3957500 6.525412 0.02268846
There are two general approaches to getting bootstrap intervals for your regression estimates (for \(\beta\), \(\hat{E}(Y|X)\), \([Y|X]\)):
The bootstrap is not a good idea when you have few observations, so we'll be working with an expanded version of our simulated data set.
# Expanded simulated data set: n = 35, same true coefficients as before.
set.seed(8134)
n <- 35
x0 <- 1                 # intercept column (recycled to length n)
x1 <- rnorm(n)
x2 <- rnorm(n)
X <- as.matrix(data.frame(x0, x1, x2))
B <- c(-3, .5, 2)
sig_sq <- 2.25
epsilon <- rnorm(n, mean = 0, sd = sqrt(sig_sq))
Y <- X %*% B + epsilon
df <- data.frame(X, Y)
# First bootstrap sample: draw n rows from df with replacement (dplyr).
B1 <- sample_frac(df, replace = TRUE)
head(B1)
## x0 x1 x2 Y ## 5 1 -1.47130074 0.97980527 -1.335847 ## 7 1 -0.07552716 -0.21631619 -5.115138 ## 28 1 1.43168857 -0.22640367 -2.349663 ## 14 1 0.69730970 0.04931632 -2.626685 ## 19 1 0.56314232 -0.82051067 -3.462349 ## 31 1 0.43810063 -0.14818633 -2.705051
# Second, independent bootstrap sample.
B2 <- sample_frac(df, replace = TRUE)
head(B2)
## x0 x1 x2 Y ## 14 1 0.6973097 0.04931632 -2.626685 ## 31 1 0.4381006 -0.14818633 -2.705051 ## 19 1 0.5631423 -0.82051067 -3.462349 ## 1 1 -0.5478920 -0.51409320 -4.939492 ## 20 1 0.5234250 1.45054512 -1.631597 ## 19.1 1 0.5631423 -0.82051067 -3.462349
# Fit the model on the first bootstrap sample.
mB1 <- lm(Y ~ x1 + x2, data = B1)
coef(mB1)
## (Intercept) x1 x2 ## -2.820623 0.362766 1.570162
# Fit the model on the second bootstrap sample.
mB2 <- lm(Y ~ x1 + x2, data = B2)
coef(mB2)
## (Intercept) x1 x2 ## -2.8421169 0.7674391 1.5163963
# Case (pairs) bootstrap for the x1 coefficient: resample whole rows,
# refit, and collect the slope estimate each time.
it <- 5000
beta_hp <- rep(NA_real_, it)
for (i in seq_len(it)) {
  # Local name instead of `B`, so the true-coefficient vector defined
  # earlier in the script is not clobbered by the loop.
  boot_df <- sample_frac(df, replace = TRUE)
  beta_hp[i] <- coef(lm(Y ~ x1 + x2, data = boot_df))[2]
}
sd(beta_hp)
## [1] 0.1988385
# Compare the bootstrap SE with lm()'s analytic standard errors.
m1 <- lm(Y ~ x1 + x2, data = df)
summary(m1)$coef
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -3.0149299 0.2057590 -14.652724 9.591452e-16 ## x1 0.4959268 0.2034211 2.437932 2.050589e-02 ## x2 1.5608675 0.1950372 8.002922 3.894623e-09
It seemed to work fine but it is a bit odd because the bootstrap procedure suggests that both the \(X\) and the \(Y\) are random variables.
Can we devise a procedure that is in closer accordance with our idea of regression as conditioning on the values of \(X\)?
After conditioning on the values of \(X\), \(Y\) is only random through the random vector \(\epsilon\). Why don't we bootstrap that?
# Residual bootstrap: condition on X, resample only the residuals.
it <- 5000
beta_hp <- rep(NA_real_, it)
m1 <- lm(Y ~ x1 + x2, data = df)
B <- df
for (i in seq_len(it)) {
  # Keep X fixed; add resampled residuals onto the fitted values.
  # fitted()/residuals() avoid fragile partial matching via $fit / $res.
  B$Y <- fitted(m1) + sample(residuals(m1), replace = TRUE)
  beta_hp[i] <- coef(lm(Y ~ x1 + x2, data = B))[2]
}
sd(beta_hp)
## [1] 0.196344
summary(m1)$coef
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -3.0149299 0.2057590 -14.652724 9.591452e-16 ## x1 0.4959268 0.2034211 2.437932 2.050589e-02 ## x2 1.5608675 0.1950372 8.002922 3.894623e-09
The general form of the multivariate Normal distribution is
\[ \boldsymbol{X} \sim N(\boldsymbol{\mu}, \boldsymbol{\Sigma}) \]
Where \(\boldsymbol{X}\) is a \(p \times 1\) random vector, \(\boldsymbol{\mu}\) is a \(p \times 1\) mean vector, and \(\boldsymbol{\Sigma}\) is a \(p \times p\) variance/covariance matrix. (A sample of \(n\) draws can be arranged as an \(n \times p\) matrix.)
You can access densities and generate random numbers for this distribution via the dmvnorm() and rmvnorm() functions.
# Generic usage: n draws from N(mu, Sigma).
library(mvtnorm)
rmvnorm(n, mean = mu, sigma = Sigma)
First, specify parameters,
# Mean vector and symmetric variance/covariance matrix for a 3-d Normal:
# components 1 and 3 covary (0.7); component 2 is independent of both.
mu_vec <- c(1, 2, 3)
sigma_mat <- rbind(
  c(0.5, 0.0, 0.7),
  c(0.0, 0.5, 0.0),
  c(0.7, 0.0, 2.0)
)
sigma_mat
## [,1] [,2] [,3] ## [1,] 0.5 0.0 0.7 ## [2,] 0.0 0.5 0.0 ## [3,] 0.7 0.0 2.0
then, generate random variables.
rmvnorm(1, mean = mu_vec, sigma = sigma_mat)
## [,1] [,2] [,3] ## [1,] 0.2138873 0.4631653 2.079299
rmvnorm(1, mean = mu_vec, sigma = sigma_mat)
## [,1] [,2] [,3] ## [1,] 1.223988 2.113755 3.117293
rmvnorm(10, mean = mu_vec, sigma = sigma_mat)
## [,1] [,2] [,3] ## [1,] 0.48087366 2.8899896 2.889368 ## [2,] 1.29128984 2.5380857 4.171362 ## [3,] 1.34164338 1.5339317 4.110056 ## [4,] 1.31996910 0.5135465 3.566813 ## [5,] 0.01468775 1.2799916 1.300745 ## [6,] 1.01529290 1.9279481 2.973339 ## [7,] 1.38404369 2.8238943 3.733078 ## [8,] 0.68423316 0.5464896 3.220507 ## [9,] 0.61221955 2.0238233 3.004848 ## [10,] 0.16797158 1.5428333 2.185705
# Draw 100 observations and inspect the pairwise marginals.
X <- rmvnorm(100, mean = mu_vec, sigma = sigma_mat)
library(GGally)
ggpairs(as.data.frame(X))
We can write a function to calculate the Bernoulli log-likelihood.
# Bernoulli (logistic regression) log-likelihood.
# B: coefficient vector (p x 1); X: design matrix (n x p); Y: 0/1 response.
# Returns sum_i [ Y_i log p_i + (1 - Y_i) log(1 - p_i) ] with
# p_i = logistic(x_i' B).
l_bern <- function(B, X, Y) {
  eta <- X %*% B  # linear predictor
  # plogis(eta, log.p = TRUE) = log p, plogis(-eta, log.p = TRUE) = log(1 - p);
  # this avoids the overflow/underflow of log(1/(1 + exp(-eta))) for large |eta|.
  sum(Y * plogis(eta, log.p = TRUE) + (1 - Y) * plogis(-eta, log.p = TRUE))
}
First set the size of the data.
# One predictor (plus intercept), n = 35 observations.
p <- 1
n <- 35
Then generate the \(X\).
# Design matrix: intercept column plus p Normal predictors with variance 1/2.
library(mvtnorm)
X <- cbind(1, rmvnorm(n, mean = rep(0, p), sigma = diag(p) / 2))
X
## [,1] [,2] ## [1,] 1 -0.67966050 ## [2,] 1 -0.63762447 ## [3,] 1 -0.40781193 ## [4,] 1 -1.33184998 ## [5,] 1 -0.36875389 ## [6,] 1 0.31413884 ## [7,] 1 0.13188304 ## [8,] 1 0.39427829 ## [9,] 1 -1.30178031 ## [10,] 1 -0.88481574 ## [11,] 1 -0.63412674 ## [12,] 1 0.11924672 ## [13,] 1 -0.37574294 ## [14,] 1 -0.78881965 ## [15,] 1 1.46215759 ## [16,] 1 -0.07894181 ## [17,] 1 -0.28377738 ## [18,] 1 -0.20286877 ## [19,] 1 0.33974093 ## [20,] 1 0.03740265 ## [21,] 1 0.19898271 ## [22,] 1 -0.12334614 ## [23,] 1 0.34488410 ## [24,] 1 0.56970579 ## [25,] 1 0.33629371 ## [26,] 1 -0.18470080 ## [27,] 1 0.14717197 ## [28,] 1 0.20316661 ## [29,] 1 -0.08620660 ## [30,] 1 -0.22954595 ## [31,] 1 -0.50711617 ## [32,] 1 0.10766537 ## [33,] 1 -0.48394873 ## [34,] 1 0.51052127 ## [35,] 1 1.00117094
Then set \(B\).
B <- c(-1, 2)
Finally, generate \(Y\).
# Bernoulli draws with logistic success probability.
Y <- rbinom(n, size = 1, prob = 1 / (1 + exp(- X %*% B)))
Y
## [1] 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1
# A second, independent realization of Y from the same model.
Y <- rbinom(n, size = 1, prob = 1 / (1 + exp(- X %*% B)))
Y
## [1] 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 1
l_bern(B, X, Y)
## [1] -14.98289
l_bern(c(0, 0), X, Y)
## [1] -24.26015
For a whole range of values…
# Evaluate the log-likelihood on a 300 x 300 grid of (B0, B1) pairs.
B0 <- seq(-7, 5, length.out = 300)
B1 <- seq(-4, 8, length.out = 300)
l_surface <- matrix(0, nrow = length(B0), ncol = length(B1))
for (i in seq_along(B0)) {
  for (j in seq_along(B1)) {
    l_surface[i, j] <- l_bern(B = c(B0[i], B1[j]), X, Y)
  }
}
# Interactive 3-d view of the likelihood surface.
library(plotly)
plot_ly(z = ~l_surface) %>% add_surface(x = B0, y = B1)
You could try your luck with all-purpose optim()…
# optim() MINIMIZES by default; fnscale = -1 flips it to maximize the
# log-likelihood (without it, optim heads toward the minimum instead).
optim(par = c(0, 0), fn = l_bern, X = X, Y = Y, control = list(fnscale = -1))
## $par ## [1] 14.35824 -16.80262 ## ## $value ## [1] -502.5453 ## ## $counts ## function gradient ## 155 NA ## ## $convergence ## [1] 0 ## ## $message ## NULL
Or look for a dedicated optimizer.
# maxLik maximizes by default (Newton-Raphson).
library(maxLik)
maxLik(l_bern, start = c(0, 0), X = X, Y = Y)
## Maximum Likelihood estimation ## Newton-Raphson maximisation, 6 iterations ## Return code 1: gradient close to zero ## Log-Likelihood: -14.47858 (2 free parameter(s)) ## Estimate(s): -1.253571 3.134083
# Sanity check against glm() with a logit link.
df <- data.frame(Y = Y, x1 = X[, 2])
coef(glm(Y ~ x1, data = df, family = "binomial"))
## (Intercept) x1 ## -1.253571 3.134083
What happens when we draw \(n = 350\) instead of \(n = 35\)?
# Redo the surface with ten times the data (n = 350).
n <- 350
X <- cbind(1, rmvnorm(n, mean = rep(0, p), sigma = diag(p) / 2))
Y <- rbinom(n, size = 1, prob = 1 / (1 + exp(- X %*% B)))
l_surface <- matrix(0, nrow = length(B0), ncol = length(B1))
for (i in seq_along(B0)) {
  for (j in seq_along(B1)) {
    l_surface[i, j] <- l_bern(B = c(B0[i], B1[j]), X, Y)
  }
}
# Same surface plot, with the z-axis clipped to show the peak region.
library(plotly)
axz <- list(range = c(-350, -150))
plot_ly(z = ~l_surface) %>%
  add_surface(x = B0, y = B1) %>%
  layout(scene = list(zaxis = axz))
Any \(\hat{\theta}_{MLE}\) is a function of (random) data, therefore it's a random variable with a distribution. If \(\hat{\theta}_{MLE}\) is a vector then the random vector has a joint distribution.
# One realization of the data and its MLE.
n <- 35
X <- cbind(1, rmvnorm(n, mean = rep(0, p), sigma = diag(p) / 2))
Y <- rbinom(n, size = 1, prob = 1 / (1 + exp(- X %*% B)))
ml <- maxLik(l_bern, start = c(0, 0), X = X, Y = Y)
ml$estimate
## [1] -0.7398946 1.6349121
# Redraw Y only: a new realization gives a different MLE.
Y <- rbinom(n, size = 1, prob = 1 / (1 + exp(- X %*% B)))
ml <- maxLik(l_bern, start = c(0, 0), X = X, Y = Y)
ml$estimate
## [1] -1.597595 1.544701
We can fully simulate the joint distribution of \(\hat{\theta}_{MLE}\).
# Simulate the sampling distribution of the MLE: redraw Y, refit, repeat.
it <- 500
MLE <- matrix(NA_real_, nrow = it, ncol = p + 1)
for (i in seq_len(it)) {
  Y <- rbinom(n, size = 1, prob = 1 / (1 + exp(- X %*% B)))
  ml <- maxLik(l_bern, start = c(0, 0), X = X, Y = Y)
  MLE[i, ] <- ml$estimate
}
MLE_35 <- as.data.frame(MLE)
sapply(MLE_35, mean)
## V1 V2 ## -1.091345 2.443953
ggpairs(MLE_35)
Looks a bit skewed and a bit biased. How does this change with sample size?
# Same simulation with n = 350 to see consistency kick in.
n <- 350
X <- cbind(1, rmvnorm(n, mean = rep(0, p), sigma = diag(p) / 2))
MLE <- matrix(NA_real_, nrow = it, ncol = p + 1)
for (i in seq_len(it)) {
  Y <- rbinom(n, size = 1, prob = 1 / (1 + exp(- X %*% B)))
  ml <- maxLik(l_bern, start = c(0, 0), X = X, Y = Y)
  MLE[i, ] <- ml$estimate
}
MLE_350 <- as.data.frame(MLE)
sapply(MLE_350, mean)
## V1 V2 ## -1.006442 2.015724
ggpairs(MLE_350)
Bias seems to be going away, variance is shrinking (consistent?) and it's starting to look MVN…
The bootstrap is primarily useful for estimating the standard error of parameter estimates by plugging in the ECDF for the true distribution function when sampling.
Conventional residuals aren't defined for logistic regression, and we can't just bootstrap the y's, so we're left with bootstrapping the cases.
# Bootstrap the cases: one fixed data set, then resample (X_i, Y_i) pairs.
n <- 35
X <- cbind(1, rmvnorm(n, mean = rep(0, p), sigma = diag(p) / 2))
Y <- rbinom(n, size = 1, prob = 1 / (1 + exp(- X %*% B)))
MLE <- matrix(NA_real_, nrow = it, ncol = p + 1)
for (i in seq_len(it)) {
  # Resample row indices with replacement; subset X and Y together so
  # each bootstrap case keeps its (x, y) pairing.
  bs_indices <- sample.int(n, replace = TRUE)
  X_bs <- X[bs_indices, , drop = FALSE]
  Y_bs <- Y[bs_indices]
  ml <- maxLik(l_bern, start = c(0, 0), X = X_bs, Y = Y_bs)
  MLE[i, ] <- ml$estimate
}
MLE_bs <- as.data.frame(MLE)
ggpairs(MLE_bs)
sapply(MLE_bs, sd)
## V1 V2 ## 0.7548347 1.6133185
# Compare the bootstrap SEs with glm()'s asymptotic standard errors.
df <- data.frame(Y, x1 = X[, 2])
summary(glm(Y ~ x1, data = df, family = "binomial"))$coef[, 2]
## (Intercept) x1 ## 0.797072 1.259880
sapply(MLE_35, sd, na.rm = TRUE)
## V1 V2 ## 0.5245442 1.2665309